/* This do-file merges the list of grade 10 repeaters shared by MoE
with our grade 10 database */


// Load the repeater list; the first spreadsheet row supplies the variable names
import excel using $repeat10, firstrow clear

// Row position in the raw list, kept as an identifier for each repeater record
generate indexnumber = _n

// Normalise the name: lower-case it, then delete every occurrence of the
// characters below (given by ASCII code):
//   32 space   44 comma   39 apostrophe   46 dot   47 slash
//   45 dash    61 equals  40 opening parenthesis   41 closing parenthesis
replace name = lower(name)
foreach code of numlist 32 44 39 46 47 45 61 40 41 {
    replace name = subinstr(name, char(`code'), "", .)
}

// The "_r" suffix marks variables that come from the repeater list
rename (name student_code) (name_r studentid_r)

keep indexnumber name_r studentid_r

save "$temp/repeat_grade10", replace


********************************************************************************
*
*			MERGE WITH GRADE 10 DATABASE  
*
********************************************************************************

// STEP 1: PERFECT MATCHING ON STUDENT ID + DOUBLE CHECK NAME 
********************************************************************************
* variable used as the exact-match key in this step
gl criteria studentid  

* open the cleaned list of repeaters and give the ID the common key name
use "$temp/repeat_grade10.dta", clear 
rename studentid_r studentid 

* flag records whose student ID occurs more than once in the repeater list
duplicates tag $criteria, g(temp)
tab temp

* obtain the subset with a unique, non-missing student ID.
* A missing ID cannot identify anyone, so such records are always deferred to
* the next round. (Without the explicit check, a lone missing ID would count as
* "unique" while several missing IDs would be tagged as duplicates.)
preserve 
keep if temp==0 & !missing(studentid)
drop temp
save "$temp/tvet_step1u", replace 
restore 

* keep the duplicated (and missing-ID) records for next round match
keep if temp~=0 | missing(studentid)
drop temp
* restore the repeater-list name of the ID; no -capture- so a failure is visible
rename studentid studentid_r
save "$temp/tvet_step1d", replace


* open the survey data and prepare it for matching; result saved as data10.dta
use "$temp/stem_analysis.dta", clear
* strip all blanks from the survey name (the repeater-list name has them removed too)
replace name = subinstr(name," ","",.)  
* recode sex 0 -> 2 (NOTE(review): presumably to align with another source's coding -- confirm)
replace sex = 2 if sex == 0 
format citizenid %20s

* Rebuild citizenid's storage: copy the values aside, blank the variable,
* let -compress- shrink the now-empty variable to its smallest storage type,
* then write the values back and drop the temporary copy.
* NOTE(review): this looks like the usual recipe for converting a strL variable
* to a fixed-length str# -- confirm the original storage type.
generate str citizenid_st = citizenid
replace citizenid = ""
compress citizenid
replace citizenid = citizenid_st
drop citizenid_st
* display the resulting storage type for a visual check in the log
describe citizenid

save "$temp/data10.dta", replace 


* open the prepared survey data and give the baseline ID the common key name
use "$temp/data10.dta", clear
rename studentid_bl studentid 

* flag records whose student ID occurs more than once in the survey data
duplicates tag $criteria, g(temp)

* obtain the subset with a unique, non-missing student ID.
* A missing ID cannot identify anyone, so such records are always deferred to
* the next round. (Without the explicit check, a lone missing ID would count as
* "unique" while several missing IDs would be tagged as duplicates.)
preserve 
keep if temp==0 & !missing(studentid)
drop temp
save "$temp/data10_step1u", replace 
restore 

* keep the duplicated (and missing-ID) records for next round match
keep if temp~=0 | missing(studentid)
drop temp
rename studentid studentid_bl 
save "$temp/data10_step1d", replace

* merging two unique data together using fully matched student ID & check fuzzy name 
use "$temp/tvet_step1u", clear
merge 1:1 $criteria using "$temp/data10_step1u"

* similarity scores between the repeater-list name and two versions of the
* survey name: name, and name_original with blanks removed
* (-matchit- and -unique- are user-written commands and must be installed)
matchit name_r name, g(namescore)
g name2 = subinstr(name_original," ","",.)  
matchit name_r name2, g(namescore2)
sum namescore if _merge==3
* interactive inspection of the ID-matched names
br name name2 name_r if _merge==3

* An ID match is accepted only if at least one name score is non-missing and
* >= 0.5. Stata treats a missing value as larger than any number, so a plain
* "namescore < 0.5" test would let records with no computable score through
* unchecked; the missing() conditions close that gap.
g byte name_ok = (namescore >= 0.5 & !missing(namescore)) | ///
                 (namescore2 >= 0.5 & !missing(namescore2))

keep if _merge==3 & name_ok
keep indexnumber uniqueid 
* report how many distinct repeater records were matched
unique indexnumber 
save "$temp/repeat_step1m", replace 


// COMBINE DATA TOGETHER 
* start from the step-1 matches (indexnumber - uniqueid pairs); path quoted so
* that a $temp directory containing spaces does not break the command
use "$temp/repeat_step1m", clear
* keep only matches whose indexnumber is present in the cleaned repeater list
merge 1:1 indexnumber using "$temp/repeat_grade10.dta", keep(matched) nogen
keep uniqueid

* record how each match was obtained and flag the record as a repeater
g r_repeat_match_criteria = "perfect(studentID) + fuzzy(name)"
g r_repeat_data = 1  

lab var r_repeat_data "repeat grade 10 (administrative data)"

* NOTE(review): science_stream is set to 0 for every matched repeater; the
* reason is not visible in this file -- confirm against the analysis code
g science_stream = 0

save "$temp/grade10_analysis_repeat.dta", replace 





